In this lesson, we will compare procedural and declarative approaches to computing aggregate values (mean, maximum value) from time series of concentrations at a single site.
In general, you will find that an R script often follows a set of common operations:
Load libraries
library(dplyr)
library(reshape2)
library(chron)
library(ggplot2)
source("GRB001.R")
Define options
Sys.setlocale("LC_TIME","C")
## [1] "C"
options(stringsAsFactors=FALSE)
options(chron.year.abb=FALSE)
theme_set(theme_bw()) # just my preference for plots
The data is available from the National Air Pollution Monitoring Network (NABEL) of Switzerland.
We have downloaded hourly time series from 2013 for Lausanne from the NABEL data query, and placed this file in a folder called “data/2013/” located in the subdirectory of the working directory.
First, check your working directory:
getwd()
Define your input file relative to this path:
filename <- "data/2013/LAU.csv"
file.exists(filename)
## [1] TRUE
Read data table:
data <- read.table(filename,sep=";",skip=6,
col.names=c("datetime","O3","NO2","CO","PM10","TEMP","PREC","RAD"))
Check a sample of your data:
head(data)
## datetime O3 NO2 CO PM10 TEMP PREC RAD
## 1 31.12.2012 01:00 7.8 56.3 0.5 16.1 3.8 0 -2.4
## 2 31.12.2012 02:00 22.4 38.0 0.4 11.6 4.1 0 -2.3
## 3 31.12.2012 03:00 14.5 37.2 0.3 10.3 3.1 0 -2.1
## 4 31.12.2012 04:00 28.7 25.4 0.3 10.5 3.5 0 -2.2
## 5 31.12.2012 05:00 19.6 33.7 0.3 9.0 2.9 0 -2.2
## 6 31.12.2012 06:00 30.8 51.2 0.3 8.7 3.2 0 -2.3
Check the structure of your object:
str(data)
## 'data.frame': 8784 obs. of 8 variables:
## $ datetime: chr "31.12.2012 01:00" "31.12.2012 02:00" "31.12.2012 03:00" "31.12.2012 04:00" ...
## $ O3 : num 7.8 22.4 14.5 28.7 19.6 30.8 25.1 28.3 19.4 23.7 ...
## $ NO2 : num 56.3 38 37.2 25.4 33.7 51.2 51.7 44.2 60.6 53.8 ...
## $ CO : num 0.5 0.4 0.3 0.3 0.3 0.3 0.4 0.3 0.4 0.4 ...
## $ PM10 : num 16.1 11.6 10.3 10.5 9 8.7 8.6 9.7 10.2 11.2 ...
## $ TEMP : num 3.8 4.1 3.1 3.5 2.9 3.2 3.3 2.9 2.8 3.5 ...
## $ PREC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ RAD : num -2.4 -2.3 -2.1 -2.2 -2.2 ...
Check column classes:
ColClasses(data)
datetime O3 NO2 CO PM10 TEMP PREC RAD 1 character numeric numeric numeric numeric numeric numeric numeric
Convert date/time to useful data types:
data[,"datetime"] <- as.chron(data[,"datetime"], "%d.%m.%Y %H:%M")
data[,"month"] <- months(data[,"datetime"])
data[,"date"] <- dates(data[,"datetime"])
Check data sample, structure, and column classes:
head(data)
## datetime O3 NO2 CO PM10 TEMP PREC RAD month date
## 1 (12/31/2012 01:00:00) 7.8 56.3 0.5 16.1 3.8 0 -2.4 Dec 12/31/2012
## 2 (12/31/2012 02:00:00) 22.4 38.0 0.4 11.6 4.1 0 -2.3 Dec 12/31/2012
## 3 (12/31/2012 03:00:00) 14.5 37.2 0.3 10.3 3.1 0 -2.1 Dec 12/31/2012
## 4 (12/31/2012 04:00:00) 28.7 25.4 0.3 10.5 3.5 0 -2.2 Dec 12/31/2012
## 5 (12/31/2012 05:00:00) 19.6 33.7 0.3 9.0 2.9 0 -2.2 Dec 12/31/2012
## 6 (12/31/2012 06:00:00) 30.8 51.2 0.3 8.7 3.2 0 -2.3 Dec 12/31/2012
str(data)
## 'data.frame': 8784 obs. of 10 variables:
## $ datetime:Classes 'chron', 'dates', 'times' atomic [1:8784] 15705 15705 15705 15705 15705 ...
## .. ..- attr(*, "format")= Named chr [1:2] "m/d/y" "h:m:s"
## .. .. ..- attr(*, "names")= chr [1:2] "dates" "times"
## .. ..- attr(*, "origin")= Named num [1:3] 1 1 1970
## .. .. ..- attr(*, "names")= chr [1:3] "month" "day" "year"
## $ O3 : num 7.8 22.4 14.5 28.7 19.6 30.8 25.1 28.3 19.4 23.7 ...
## $ NO2 : num 56.3 38 37.2 25.4 33.7 51.2 51.7 44.2 60.6 53.8 ...
## $ CO : num 0.5 0.4 0.3 0.3 0.3 0.3 0.4 0.3 0.4 0.4 ...
## $ PM10 : num 16.1 11.6 10.3 10.5 9 8.7 8.6 9.7 10.2 11.2 ...
## $ TEMP : num 3.8 4.1 3.1 3.5 2.9 3.2 3.3 2.9 2.8 3.5 ...
## $ PREC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ RAD : num -2.4 -2.3 -2.1 -2.2 -2.2 ...
## $ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 12 12 12 12 12 12 12 12 12 12 ...
## $ date :Classes 'dates', 'times' atomic [1:8784] 15705 15705 15705 15705 15705 ...
## .. ..- attr(*, "format")= Named chr [1:2] "m/d/y" "h:m:s"
## .. .. ..- attr(*, "names")= chr [1:2] "dates" "times"
## .. ..- attr(*, "origin")= Named num [1:3] 1 1 1970
## .. .. ..- attr(*, "names")= chr [1:3] "month" "day" "year"
ColClasses(data)
datetime O3 NO2 CO PM10 TEMP PREC
1 chron,dates,times numeric numeric numeric numeric numeric numeric RAD month date 1 numeric ordered,factor dates,times
View raw ozone concentrations:
ggp <- ggplot(data)+
geom_line(aes(datetime, O3))+
scale_x_chron()
print(ggp)
Solve by looping. Note that we “grow” a data frame by the function rbind.
unique.months <- levels(data[,"month"])
O3.monthly <- NULL
for(.month in unique.months) {
table <- data %>% filter(month == .month)
tmp <- data.frame(month=.month, O3=mean(table[,"O3"], na.rm=TRUE))
O3.monthly <- rbind(O3.monthly, tmp)
}
print(O3.monthly)
## month O3
## 1 Jan 22.62880
## 2 Feb 40.58125
## 3 Mar 35.02078
## 4 Apr 50.78187
## 5 May 51.85189
## 6 Jun 59.61657
## 7 Jul 76.19515
## 8 Aug 66.49771
## 9 Sep 43.20181
## 10 Oct 24.74307
## 11 Nov 25.85577
## 12 Dec 18.66697
Convert month column from character string to “factor”:
class(O3.monthly[,"month"])
## [1] "character"
O3.monthly[,"month"] <- factor(O3.monthly[,"month"], unique.months)
class(O3.monthly[,"month"])
## [1] "factor"
Another visual representation:
ggp <- ggplot(O3.monthly) +
geom_bar(aes(month, O3), stat="identity")
print(ggp)
Calculate:
unique.dates <- unique(data[,"date"])
O3.dailymax <- NULL
for(.date in unique.dates) {
table <- data %>% filter(date == .date)
tmp <- data.frame(date=.date, O3=max(table[,"O3"], na.rm=TRUE))
O3.dailymax <- rbind(O3.dailymax, tmp)
}
head(O3.dailymax)
## date O3
## 1 15705 47.9
## 2 15706 61.2
## 3 15707 70.4
## 4 15708 47.9
## 5 15709 22.9
## 6 15710 36.8
Convert date column to chron object:
class(O3.dailymax[,"date"])
## [1] "numeric"
O3.dailymax[,"date"] <- as.chron(O3.dailymax[,"date"])
class(O3.dailymax[,"date"])
## [1] "dates" "times"
Inspect:
head(O3.dailymax)
## date O3
## 1 12/31/2012 47.9
## 2 01/01/2013 61.2
## 3 01/02/2013 70.4
## 4 01/03/2013 47.9
## 5 01/04/2013 22.9
## 6 01/05/2013 36.8
tail(O3.dailymax)
## date O3
## 362 12/27/2013 42.0
## 363 12/28/2013 62.6
## 364 12/29/2013 52.2
## 365 12/30/2013 39.0
## 366 12/31/2013 24.1
## 367 01/01/2014 7.9
Plot ECDF (empirical cumulative distribution function):
ggp <- ggplot(O3.dailymax) +
geom_line(aes(O3),stat="ecdf") +
labs(y = "ECDF")
print(ggp)
With a single expression, we reproduced the loop used to create O3.dailymax:
data %>% group_by(month) %>%
summarize(O3 = mean(O3, na.rm=TRUE))
## Source: local data frame [12 x 2]
##
## month O3
## (fctr) (dbl)
## 1 Jan 22.62880
## 2 Feb 40.58125
## 3 Mar 35.02078
## 4 Apr 50.78187
## 5 May 51.85189
## 6 Jun 59.61657
## 7 Jul 76.19515
## 8 Aug 66.49771
## 9 Sep 43.20181
## 10 Oct 24.74307
## 11 Nov 25.85577
## 12 Dec 18.66697
We can easily extend to two variables:
data %>% group_by(month) %>%
summarize(O3 = mean(O3, na.rm=TRUE),
NO2 = mean(NO2, na.rm=TRUE))
## Source: local data frame [12 x 3]
##
## month O3 NO2
## (fctr) (dbl) (dbl)
## 1 Jan 22.62880 47.52709
## 2 Feb 40.58125 45.05580
## 3 Mar 35.02078 50.33841
## 4 Apr 50.78187 40.48799
## 5 May 51.85189 38.63450
## 6 Jun 59.61657 38.92350
## 7 Jul 76.19515 38.44542
## 8 Aug 66.49771 34.13576
## 9 Sep 43.20181 42.02855
## 10 Oct 24.74307 40.52557
## 11 Nov 25.85577 36.38623
## 12 Dec 18.66697 56.98877
We first transform the data frame:
lf <- melt(data, id.vars=c("datetime", "month", "date"))
Let us inspect this transformation:
head(lf)
## datetime month date variable value
## 1 (12/31/2012 01:00:00) Dec 12/31/2012 O3 7.8
## 2 (12/31/2012 02:00:00) Dec 12/31/2012 O3 22.4
## 3 (12/31/2012 03:00:00) Dec 12/31/2012 O3 14.5
## 4 (12/31/2012 04:00:00) Dec 12/31/2012 O3 28.7
## 5 (12/31/2012 05:00:00) Dec 12/31/2012 O3 19.6
## 6 (12/31/2012 06:00:00) Dec 12/31/2012 O3 30.8
tail(lf)
## datetime month date variable value
## 61483 (12/31/2013 19:00:00) Dec 12/31/2013 RAD -2.4
## 61484 (12/31/2013 20:00:00) Dec 12/31/2013 RAD -2.4
## 61485 (12/31/2013 21:00:00) Dec 12/31/2013 RAD -2.3
## 61486 (12/31/2013 22:00:00) Dec 12/31/2013 RAD -2.2
## 61487 (12/31/2013 23:00:00) Dec 12/31/2013 RAD -1.6
## 61488 (01/01/2014 00:00:00) Jan 01/01/2014 RAD -1.3
ColClasses(lf)
datetime month date variable value
1 chron,dates,times ordered,factor dates,times factor numeric
This is amenable for plotting:
ggp <- ggplot(lf) +
geom_line(aes(datetime, value))+
facet_grid(variable~., scale="free_y") +
scale_x_chron()
print(ggp)
Using this, we can aggregate using three approaches:
group_by: as illustrated abovedcast: aggregate statistics through pivotingstat_summary: called through geom operationgroup_by operationresult <- lf %>% group_by(month, variable) %>%
summarize(value = mean(value, na.rm=TRUE))
The inverse opration of melt:
dcast(result, month~variable)
## month O3 NO2 CO PM10 TEMP PREC
## 1 Jan 22.62880 47.52709 0.4439516 22.55780 2.2280537 0.08709677
## 2 Feb 40.58125 45.05580 0.4380952 28.74501 0.6016369 0.07113095
## 3 Mar 35.02078 50.33841 0.4849328 35.95249 4.7498656 0.10713324
## 4 Apr 50.78187 40.48799 0.3930556 25.55132 10.6093056 0.11930556
## 5 May 51.85189 38.63450 0.3339166 12.29838 11.9751344 0.18561828
## 6 Jun 59.61657 38.92350 0.3386111 17.55225 17.6750000 0.12517385
## 7 Jul 76.19515 38.44542 0.3680108 19.92781 22.3884409 0.16424731
## 8 Aug 66.49771 34.13576 0.3336022 17.26532 20.9435484 0.05397039
## 9 Sep 43.20181 42.02855 0.3945833 17.52681 17.0044444 0.12472222
## 10 Oct 24.74307 40.52557 0.4131720 17.46508 13.7291667 0.28721400
## 11 Nov 25.85577 36.38623 0.3695833 13.65309 6.0034722 0.15724234
## 12 Dec 18.66697 56.98877 0.5250326 25.64837 3.8698827 0.13168188
## RAD
## 1 46.25383
## 2 85.00015
## 3 98.91492
## 4 170.54847
## 5 174.21317
## 6 248.54292
## 7 272.65753
## 8 241.67258
## 9 157.18375
## 10 82.58642
## 11 53.88542
## 12 46.74668
The mean can also be calculated directly:
dcast(lf, month~variable, fun.aggregate=mean, na.rm=TRUE)
## month O3 NO2 CO PM10 TEMP PREC
## 1 Jan 22.62880 47.52709 0.4439516 22.55780 2.2280537 0.08709677
## 2 Feb 40.58125 45.05580 0.4380952 28.74501 0.6016369 0.07113095
## 3 Mar 35.02078 50.33841 0.4849328 35.95249 4.7498656 0.10713324
## 4 Apr 50.78187 40.48799 0.3930556 25.55132 10.6093056 0.11930556
## 5 May 51.85189 38.63450 0.3339166 12.29838 11.9751344 0.18561828
## 6 Jun 59.61657 38.92350 0.3386111 17.55225 17.6750000 0.12517385
## 7 Jul 76.19515 38.44542 0.3680108 19.92781 22.3884409 0.16424731
## 8 Aug 66.49771 34.13576 0.3336022 17.26532 20.9435484 0.05397039
## 9 Sep 43.20181 42.02855 0.3945833 17.52681 17.0044444 0.12472222
## 10 Oct 24.74307 40.52557 0.4131720 17.46508 13.7291667 0.28721400
## 11 Nov 25.85577 36.38623 0.3695833 13.65309 6.0034722 0.15724234
## 12 Dec 18.66697 56.98877 0.5250326 25.64837 3.8698827 0.13168188
## RAD
## 1 46.25383
## 2 85.00015
## 3 98.91492
## 4 170.54847
## 5 174.21317
## 6 248.54292
## 7 272.65753
## 8 241.67258
## 9 157.18375
## 10 82.58642
## 11 53.88542
## 12 46.74668
stat_summaryThe mean cal also be calculated in the process of plotting:
ggp <- ggplot(lf) +
geom_bar(aes(month, value), stat="summary", fun.y="mean")+
facet_grid(variable~., scale="free_y")
print(ggp)
Add errorbars to denote the full range in values:
ggp <- ggplot(lf, aes(month, value)) +
geom_bar(stat="summary", fun.y="mean")+
geom_errorbar(stat="summary",
fun.ymin=min, #function(x) quantile(x, .25),
fun.ymax=max, #function(x) quantile(x, .75))+
width=0.1) +
facet_grid(variable~., scale="free_y")
print(ggp)